1 The caret Package

The caret package has functions to simplify just about everything in data science/machine learning.

1.2 Installation

Install with:

if( !("caret" %in% installed.packages()) ) {
    install.packages("caret", dependencies = c("Depends", "Suggests"), repos = "http://cran.us.r-project.org");
}
library("caret");

1.3 Useful Caret Functions

Preprocessing

preProcess(x, method = c("center", "scale"), thresh = 0.95,pcaComp = NULL,na.remove = TRUE,k = 5,knnSummary = mean,outcome = NULL,fudge = .2,numUnique = 3,verbose = FALSE,...)

Segmenting Data

createDataPartition(y, times = 1, p = 0.5,list = TRUE, groups = min(5, length(y)))
createResample(y, times = 10, list = TRUE)
createFolds(y, k = 10, list = TRUE, returnTrain = FALSE)
createMultiFolds(y, k = 10, times = 5)
createTimeSlices(y, initialWindow, horizon = 1, fixedWindow = TRUE, skip = 0)

Training and Testing

train(x, y, method = "rf", preProcess = NULL, ..., weights = NULL, metric = ifelse(is.factor(y), "Accuracy", "RMSE"), maximize = ifelse(metric %in% c("RMSE", "logLoss"), FALSE, TRUE), trControl = trainControl(), tuneGrid = NULL, tuneLength = 3)
predict (object, ...)

Comparing Models

confusionMatrix(data, reference, positive = NULL, dnn = c("Prediction", "Reference"), prevalence = NULL, mode = "sens_spec", ...)

All possible models are trained the same way – by passing a string to caret::train(..., method='string', ...). The full list of method strings is below. For details see http://topepo.github.io/caret/using-your-own-model-in-train.html

names(getModelInfo())
##   [1] "ada"                 "AdaBag"              "AdaBoost.M1"        
##   [4] "adaboost"            "amdai"               "ANFIS"              
##   [7] "avNNet"              "awnb"                "awtan"              
##  [10] "bag"                 "bagEarth"            "bagEarthGCV"        
##  [13] "bagFDA"              "bagFDAGCV"           "bam"                
##  [16] "bartMachine"         "bayesglm"            "bdk"                
##  [19] "binda"               "blackboost"          "blasso"             
##  [22] "blassoAveraged"      "Boruta"              "bridge"             
##  [25] "brnn"                "BstLm"               "bstSm"              
##  [28] "bstTree"             "C5.0"                "C5.0Cost"           
##  [31] "C5.0Rules"           "C5.0Tree"            "cforest"            
##  [34] "chaid"               "CSimca"              "ctree"              
##  [37] "ctree2"              "cubist"              "dda"                
##  [40] "deepboost"           "DENFIS"              "dnn"                
##  [43] "dwdLinear"           "dwdPoly"             "dwdRadial"          
##  [46] "earth"               "elm"                 "enet"               
##  [49] "enpls.fs"            "enpls"               "evtree"             
##  [52] "extraTrees"          "fda"                 "FH.GBML"            
##  [55] "FIR.DM"              "foba"                "FRBCS.CHI"          
##  [58] "FRBCS.W"             "FS.HGD"              "gam"                
##  [61] "gamboost"            "gamLoess"            "gamSpline"          
##  [64] "gaussprLinear"       "gaussprPoly"         "gaussprRadial"      
##  [67] "gbm"                 "gcvEarth"            "GFS.FR.MOGUL"       
##  [70] "GFS.GCCL"            "GFS.LT.RS"           "GFS.THRIFT"         
##  [73] "glm"                 "glmboost"            "glmnet"             
##  [76] "glmStepAIC"          "gpls"                "hda"                
##  [79] "hdda"                "hdrda"               "HYFIS"              
##  [82] "icr"                 "J48"                 "JRip"               
##  [85] "kernelpls"           "kknn"                "knn"                
##  [88] "krlsPoly"            "krlsRadial"          "lars"               
##  [91] "lars2"               "lasso"               "lda"                
##  [94] "lda2"                "leapBackward"        "leapForward"        
##  [97] "leapSeq"             "Linda"               "lm"                 
## [100] "lmStepAIC"           "LMT"                 "loclda"             
## [103] "logicBag"            "LogitBoost"          "logreg"             
## [106] "lssvmLinear"         "lssvmPoly"           "lssvmRadial"        
## [109] "lvq"                 "M5"                  "M5Rules"            
## [112] "manb"                "mda"                 "Mlda"               
## [115] "mlp"                 "mlpML"               "mlpSGD"             
## [118] "mlpWeightDecay"      "mlpWeightDecayML"    "multinom"           
## [121] "nb"                  "nbDiscrete"          "nbSearch"           
## [124] "neuralnet"           "nnet"                "nnls"               
## [127] "nodeHarvest"         "oblique.tree"        "OneR"               
## [130] "ordinalNet"          "ORFlog"              "ORFpls"             
## [133] "ORFridge"            "ORFsvm"              "ownn"               
## [136] "pam"                 "parRF"               "PART"               
## [139] "partDSA"             "pcaNNet"             "pcr"                
## [142] "pda"                 "pda2"                "penalized"          
## [145] "PenalizedLDA"        "plr"                 "pls"                
## [148] "plsRglm"             "polr"                "ppr"                
## [151] "protoclass"          "pythonKnnReg"        "qda"                
## [154] "QdaCov"              "qrf"                 "qrnn"               
## [157] "randomGLM"           "ranger"              "rbf"                
## [160] "rbfDDA"              "Rborist"             "rda"                
## [163] "relaxo"              "rf"                  "rFerns"             
## [166] "RFlda"               "rfRules"             "ridge"              
## [169] "rlda"                "rlm"                 "rmda"               
## [172] "rocc"                "rotationForest"      "rotationForestCp"   
## [175] "rpart"               "rpart1SE"            "rpart2"             
## [178] "rpartCost"           "rpartScore"          "rqlasso"            
## [181] "rqnc"                "RRF"                 "RRFglobal"          
## [184] "rrlda"               "RSimca"              "rvmLinear"          
## [187] "rvmPoly"             "rvmRadial"           "SBC"                
## [190] "sda"                 "sddaLDA"             "sddaQDA"            
## [193] "sdwd"                "simpls"              "SLAVE"              
## [196] "slda"                "smda"                "snn"                
## [199] "sparseLDA"           "spikeslab"           "spls"               
## [202] "stepLDA"             "stepQDA"             "superpc"            
## [205] "svmBoundrangeString" "svmExpoString"       "svmLinear"          
## [208] "svmLinear2"          "svmLinear3"          "svmLinearWeights"   
## [211] "svmLinearWeights2"   "svmPoly"             "svmRadial"          
## [214] "svmRadialCost"       "svmRadialSigma"      "svmRadialWeights"   
## [217] "svmSpectrumString"   "tan"                 "tanSearch"          
## [220] "treebag"             "vbmpRadial"          "vglmAdjCat"         
## [223] "vglmContRatio"       "vglmCumulative"      "widekernelpls"      
## [226] "WM"                  "wsrf"                "xgbLinear"          
## [229] "xgbTree"             "xyf"

1.3.1 Example: Creating Training Sets

Use caret::createDataPartition( y=srcData, times=1, p=0.5, list=TRUE, groups=min(5, length(y)) ):

library(caret); library(kernlab); data(spam);

# creates a numeric vector that can subset from spam
inTrain<-createDataPartition(y=spam$type, p=0.75, list=FALSE)

training <- spam[inTrain, ]

# subsets rows NOT indexed by inTrain
testing <- spam[-inTrain, ]

dim(inTrain)
## [1] 3451    1
# The training set
head( spam[inTrain, 1:8 ]); 
##   make address  all num3d  our over remove internet
## 1 0.00    0.64 0.64     0 0.32 0.00   0.00     0.00
## 2 0.21    0.28 0.50     0 0.14 0.28   0.21     0.07
## 4 0.00    0.00 0.00     0 0.63 0.00   0.31     0.63
## 6 0.00    0.00 0.00     0 1.85 0.00   0.00     1.85
## 7 0.00    0.00 0.00     0 1.92 0.00   0.00     0.00
## 9 0.15    0.00 0.46     0 0.61 0.00   0.30     0.00
# The test set
head(spam[-inTrain, 1:8]);
##    make address  all num3d  our over remove internet
## 3  0.06    0.00 0.71     0 1.23 0.19   0.19     0.12
## 5  0.00    0.00 0.00     0 0.63 0.00   0.31     0.63
## 8  0.00    0.00 0.00     0 1.88 0.00   0.00     1.88
## 15 0.00    0.00 1.42     0 0.71 0.35   0.00     0.35
## 16 0.00    0.42 0.42     0 1.27 0.00   0.42     0.00
## 21 0.00    0.00 0.00     0 0.00 0.00   0.00     0.00

Note the row numbers

1.3.2 Example: Fitting a Model

set.seed(32343)

# trains a generalized linear model
modelFit <- train(type ~., data = training, method = 'glm')

# view summary of model
modelFit
## Generalized Linear Model 
## 
## 3451 samples
##   57 predictor
##    2 classes: 'nonspam', 'spam' 
## 
## No pre-processing
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 3451, 3451, 3451, 3451, 3451, 3451, ... 
## Resampling results:
## 
##   Accuracy   Kappa    
##   0.9198818  0.8317119
## 
## 
# view parameters (weights/theta) of final model
modelFit$finalModel
## 
## Call:  NULL
## 
## Coefficients:
##       (Intercept)               make            address  
##        -1.617e+00         -5.494e-01         -1.528e-01  
##               all              num3d                our  
##         1.429e-01          3.733e+00          5.066e-01  
##              over             remove           internet  
##         9.540e-01          1.883e+00          1.234e+00  
##             order               mail            receive  
##         1.109e+00          1.423e-01          2.384e-01  
##              will             people             report  
##        -1.205e-01         -2.180e-01          7.444e-02  
##         addresses               free           business  
##         9.789e-01          9.337e-01          1.328e+00  
##             email                you             credit  
##         9.268e-02          1.011e-01          1.426e+00  
##              your               font             num000  
##         1.703e-01          1.655e-01          1.756e+00  
##             money                 hp                hpl  
##         3.088e-01         -2.713e+00         -6.414e-01  
##            george             num650                lab  
##        -2.036e+01          5.900e-01         -2.029e+00  
##              labs             telnet             num857  
##        -3.933e-01         -9.324e-02          1.108e+00  
##              data             num415              num85  
##        -1.068e+00          1.283e+00         -1.629e+00  
##        technology            num1999              parts  
##         1.210e+00          8.426e-02          1.833e+00  
##                pm             direct                 cs  
##        -7.681e-01         -3.524e-01         -5.684e+02  
##           meeting           original            project  
##        -3.583e+00         -1.929e+00         -1.578e+00  
##                re                edu              table  
##        -9.668e-01         -1.758e+00         -2.806e+00  
##        conference      charSemicolon   charRoundbracket  
##        -3.374e+00         -1.325e+00         -7.183e-01  
## charSquarebracket    charExclamation         charDollar  
##        -5.464e-01          3.031e-01          5.312e+00  
##          charHash         capitalAve        capitalLong  
##         2.525e+00          7.188e-02          7.347e-03  
##      capitalTotal  
##         1.131e-03  
## 
## Degrees of Freedom: 3450 Total (i.e. Null);  3393 Residual
## Null Deviance:       4628 
## Residual Deviance: 1288  AIC: 1404

1.3.3 Example: Predicting with the Model

predictions <- predict(object = modelFit, newdata = testing)

# produces a factor vector of class labels, 'spam' or 'nonspam'
predictions[1:16]
##  [1] spam    spam    spam    spam    spam    nonspam spam    nonspam
##  [9] spam    nonspam nonspam nonspam spam    spam    spam    spam   
## Levels: nonspam spam

1.3.4 Example: Evaluating a Model

confusionMatrix(data = predictions, reference = testing$type)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction nonspam spam
##    nonspam     662   56
##    spam         35  397
##                                           
##                Accuracy : 0.9209          
##                  95% CI : (0.9037, 0.9358)
##     No Information Rate : 0.6061          
##     P-Value [Acc > NIR] : < 2e-16         
##                                           
##                   Kappa : 0.8329          
##  Mcnemar's Test P-Value : 0.03603         
##                                           
##             Sensitivity : 0.9498          
##             Specificity : 0.8764          
##          Pos Pred Value : 0.9220          
##          Neg Pred Value : 0.9190          
##              Prevalence : 0.6061          
##          Detection Rate : 0.5757          
##    Detection Prevalence : 0.6243          
##       Balanced Accuracy : 0.9131          
##                                           
##        'Positive' Class : nonspam         
## 

1.4 Caret Prediction Algorithms

  • Linear Discriminant Analysis
  • Regression
  • Naive Bayes
  • SVM
  • Classification & Regression Trees
  • Random Forest
  • Boosting
  • Etc

2 Data Slicing

2.0.1 Example: Random Subsampling

library(caret); library(kernlab); data(spam);

# creates a numeric vector that can subset from spam
inTrain <- createDataPartition(y=spam$type, p=0.75, list=FALSE)

training <- spam[inTrain, ]
testing <- spam[-inTrain, ]
dim(training); dim(testing)
## [1] 3451   58
## [1] 1150   58

2.0.2 Example: K-Fold Cross Val

  • k = the number of folds
  • returnTrain= specifies whether the folds are training or test folds
    • values = {TRUE, FALSE}
  • list= determines the return structure data type
    • values = {TRUE, FALSE} — TRUE returns a list of folds; FALSE returns a vector/matrix of fold assignments
set.seed(32323)

# split data into 10 groups with random dropout in each
# returns a list of folds
# each fold is a numeric vector of row nums from spam$type
folds <- createFolds(y=spam$type, k=10, list=TRUE, returnTrain=TRUE) # vs returnTrain=FALSE for test set

# display num of examples in each fold, source had 4601 rows
sapply(folds, length)
## Fold01 Fold02 Fold03 Fold04 Fold05 Fold06 Fold07 Fold08 Fold09 Fold10 
##   4141   4140   4141   4142   4140   4142   4141   4141   4140   4141
# view row dropout in various folds
folds[[1]][1:20]; folds[[2]][1:20]
##  [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20
##  [1]  1  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 22
# use folds to index data sets via
trn2 <- spam$type[folds[[2]]]
val2 <- spam$type[-folds[[2]]]

or

tst <- createFolds(y=spam$type, k=10, list=TRUE, returnTrain=FALSE) # vs returnTrain=TRUE for training set

Return the training set via something like spam[-tst[[i]], ]

2.0.3 Example: Resampling

set.seed(32323)

# fill each fold with random sample (with replacement) of rows
folds <- createResample(y = spam$type, times = 10, list = TRUE)

# view sizes of resampled folds, note the random collection of rows
sapply(folds, length)
## Resample01 Resample02 Resample03 Resample04 Resample05 Resample06 
##       4601       4601       4601       4601       4601       4601 
## Resample07 Resample08 Resample09 Resample10 
##       4601       4601       4601       4601
# note the repeated rows
folds[[1]][1:20]
##  [1]  1  2  3  3  3  5  5  7  8 12 12 12 13 13 13 14 14 15 16 16

2.0.4 Example: Time Series

  • initialWindow= rows in the training set
  • horizon= the next \(n\) continuous rows (test set)
set.seed(32323)

tme <- 1:1000

folds <- createTimeSlices(y = tme, initialWindow = 20, horizon = 10)

# what are the factors in each of our time slices?
names(folds)
## [1] "train" "test"

3 Training Options

Recall the generalized linear model trained on the spam dataset:

library(caret); library(kernlab); data(spam);

inTrain <- createDataPartition(y=spam$type, p=0.75, list=FALSE)

training <- spam[inTrain, ]
testing <- spam[-inTrain, ]

modelFit <- train(type ~., data = training, method = 'glm')

The caret::train() has a number of options to control how training is done. (View any default method parameters with base::args(functionName.default))

library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
args(train.default)
## function (x, y, method = "rf", preProcess = NULL, ..., weights = NULL, 
##     metric = ifelse(is.factor(y), "Accuracy", "RMSE"), maximize = ifelse(metric %in% 
##         c("RMSE", "logLoss"), FALSE, TRUE), trControl = trainControl(), 
##     tuneGrid = NULL, tuneLength = 3) 
## NULL
args(trainControl)
## function (method = "boot", number = ifelse(grepl("cv", method), 
##     10, 25), repeats = ifelse(grepl("cv", method), 1, number), 
##     p = 0.75, search = "grid", initialWindow = NULL, horizon = 1, 
##     fixedWindow = TRUE, verboseIter = FALSE, returnData = TRUE, 
##     returnResamp = "final", savePredictions = FALSE, classProbs = FALSE, 
##     summaryFunction = defaultSummary, selectionFunction = "best", 
##     preProcOptions = list(thresh = 0.95, ICAcomp = 3, k = 5), 
##     sampling = NULL, index = NULL, indexOut = NULL, indexFinal = NULL, 
##     timingSamps = 0, predictionBounds = rep(FALSE, 2), seeds = NA, 
##     adaptive = list(min = 5, alpha = 0.05, method = "gls", complete = TRUE), 
##     trim = FALSE, allowParallel = TRUE) 
## NULL

4 Plotting Predictors

Plots are your friends. Git gud at them. For this section, we use example data from ISLR Package, the companion to the book Introduction to Statistical Learning with R.

Install with:

if( !("ISLR" %in% installed.packages()) ) {
    install.packages("ISLR", repos = "http://cran.us.r-project.org");
}

4.0.1 Example: Wage Data

library("ISLR"); library(ggplot2);

data(Wage)

summary(Wage)
##       year           age               sex                    maritl    
##  Min.   :2003   Min.   :18.00   1. Male  :3000   1. Never Married: 648  
##  1st Qu.:2004   1st Qu.:33.75   2. Female:   0   2. Married      :2074  
##  Median :2006   Median :42.00                    3. Widowed      :  19  
##  Mean   :2006   Mean   :42.41                    4. Divorced     : 204  
##  3rd Qu.:2008   3rd Qu.:51.00                    5. Separated    :  55  
##  Max.   :2009   Max.   :80.00                                           
##                                                                         
##        race                   education                     region    
##  1. White:2480   1. < HS Grad      :268   2. Middle Atlantic   :3000  
##  2. Black: 293   2. HS Grad        :971   1. New England       :   0  
##  3. Asian: 190   3. Some College   :650   3. East North Central:   0  
##  4. Other:  37   4. College Grad   :685   4. West North Central:   0  
##                  5. Advanced Degree:426   5. South Atlantic    :   0  
##                                           6. East South Central:   0  
##                                           (Other)              :   0  
##            jobclass               health      health_ins      logwage     
##  1. Industrial :1544   1. <=Good     : 858   1. Yes:2083   Min.   :3.000  
##  2. Information:1456   2. >=Very Good:2142   2. No : 917   1st Qu.:4.447  
##                                                            Median :4.653  
##                                                            Mean   :4.654  
##                                                            3rd Qu.:4.857  
##                                                            Max.   :5.763  
##                                                                           
##       wage       
##  Min.   : 20.09  
##  1st Qu.: 85.38  
##  Median :104.92  
##  Mean   :111.70  
##  3rd Qu.:128.68  
##  Max.   :318.34  
## 

Set aside a test set (and validation set) before beginning exploratory analysis.

inTrain <- createDataPartition(y = Wage$wage, p = 0.7, list = FALSE)

training <- Wage[inTrain,]
testing <- Wage[-inTrain,]

dim(training); dim(testing)
## [1] 2102   12
## [1] 898  12

4.0.2 Example: Lattice Plot a Subset of Wage Features

Using caret::featurePlot()

# choose 3 features
mySet <- training[, c('age', 'education', 'jobclass')]

# plot wage as a function of features (4x4 = 16)
featurePlot(x=mySet, y=training$wage, plot='pairs')

4.0.3 Example: Scatter Plot Wage vs Age

ggplot(data=training) + aes(x=age, y=wage) + geom_point() 

Note the natural vertical segregation. How to investigate this further?

Answer: color the scatter by an additional feature:

ggplot(data=training) + aes(x=age, y=wage, color=jobclass) + geom_point() 

Very few industrial jobs are in the upper group.

4.0.4 Example: Plot with Feature Regressions

It is easy to do simple regressions inside a plot, for easy viewing. Oddly, it’s actually very difficult to do your own regression, then add that to a plot. Proceed accordingly!

g <- ggplot(data=training) + aes(x=age, y=wage, color=education) + geom_point() 

# add linear regression: wage = f(age)
g <- g + geom_smooth(method = "lm", formula = y~x)
print(g)

Regression lines have been added to plot

4.0.5 Example: Plot by Quantiles

The method here is to pseudo-manually cut the data with Hmisc::cut2(), then plot the resulting quantiles.

if( !("Hmisc" %in% installed.packages()) ) {
    install.packages("Hmisc", repos = "http://cran.us.r-project.org");
}
library(Hmisc)
## Loading required package: survival
## 
## Attaching package: 'survival'
## The following object is masked from 'package:caret':
## 
##     cluster
## Loading required package: Formula
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
## 
##     format.pval, round.POSIXt, trunc.POSIXt, units
cutWage <- cut2(training$wage, g=3)
table(cutWage)
## cutWage
## [ 20.1, 93.1) [ 93.1,118.9) [118.9,314.3] 
##           713           708           681
t1 <- table(cutWage, training$jobclass)
t1
##                
## cutWage         1. Industrial 2. Information
##   [ 20.1, 93.1)           449            264
##   [ 93.1,118.9)           361            347
##   [118.9,314.3]           275            406
# a proportional table
prop.table(x = t1, margin = 1)  # margin=dimension
##                
## cutWage         1. Industrial 2. Information
##   [ 20.1, 93.1)     0.6297335      0.3702665
##   [ 93.1,118.9)     0.5098870      0.4901130
##   [118.9,314.3]     0.4038179      0.5961821

Plot this table to look for trends

library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:Hmisc':
## 
##     combine
g1 <- ggplot(data=training) + aes(x=cutWage, y=age, fill=cutWage) + geom_boxplot()

# view points on top of boxplot
g2 <- ggplot(data=training) + aes(x=cutWage, y=age, fill=cutWage) + geom_boxplot() + geom_jitter()
grid.arrange(g1,g2, ncol=2)

4.0.6 Example: A Density Plot of Wage by Quantile

For continuous predictors

ggplot(data=training) + aes(x=wage, color=cutWage) + geom_density()

All of the above are ways to examine properties of your data set, but don’t use your test set for exploration.

You’re always looking for

  • trends
  • skewed variables
  • imbalance in outcomes/predictors
  • outcomes not explained by predictors

5 Basic Preprocessing

After plotting the variables up front, it’s time to prep the data set for training.

Consider the kernlab::spam data set one more time:

library(caret); library(kernlab); data(spam);

# creates a numeric vector that can subset from spam
inTrain<-createDataPartition(y=spam$type, p=0.75, list=FALSE)

training <- spam[inTrain, ]
testing <- spam[-inTrain, ]

hist(training$capitalAve, main="", xlab="ave. capital run length")

This is the distribution of capital letter sequence length between spam/nonspam. This variable is highly skewed, and will hamper most learning algorithms. We can redress this by shifting/stretching/squashing the input data, writing the transformation parameters to a caret preProcess object. That object is then passed to caret::predict(object=preObj, newdata=someData) to apply the same transformation to any data set; alternatively, pass the method names directly to caret::train(..., preProcess=c("center", "scale")).

  1. Preprocess an input set with preObj <- preProcess(object = inputSet , method = "" )
  2. Pass the Preprocess Object to another caret function: predict(object = preObj, newdata = groundTruth)

5.1 The Statistical Z-Transform

One way to fix model-boning is by doing a statistical z-transform on the data:

testCapAve <- testing$capitalAve

# z-transform based on training set distribution
testCapAveS <- (testCapAve - mean(training$capitalAve)) / sd(training$capitalAve)

hist(testCapAveS, xlab="ave capital run length deviation")

Another way to do this is with the caret::preProcess() function:

# col 58 is the ground-truth outcome
preObj <- preProcess( training[,-58], method=c("center", "scale") )

# pass the Preprocess object into the predict() method
trainCapAveS <- predict(preObj, training[,-58])$capitalAve
hist(trainCapAveS, xlab="ave capital run length deviation")

To z-transform the corresponding test set, you would create a Preprocess Object from the training set, then pass that object to predict(), along with the testing set:

preObj <- preProcess( training[,-58], method=c("center", "scale") )

# must pass subset of same dims to preProcess() and predict(): ex train[-58] vs test[-58]
trainCapAveS <- predict(preObj, training[,-58])$capitalAve
testCapAveS <- predict(preObj, testing[,-58])$capitalAve

You could also pass the preProcess() function as an argument to train():

set.seed(32343)

modelFit <- train(type ~., data=training, method="glm", preProcess=c("center", "scale"))
modelFit
## Generalized Linear Model 
## 
## 3451 samples
##   57 predictor
##    2 classes: 'nonspam', 'spam' 
## 
## Pre-processing: centered (57), scaled (57) 
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 3451, 3451, 3451, 3451, 3451, 3451, ... 
## Resampling results:
## 
##   Accuracy   Kappa    
##   0.9116024  0.8163398
## 
## 

5.2 Box-Cox Transform

Other preprocessing transforms are available. The Box-Cox is a power transform that takes any non-negative continuous variable and forces it toward a normal distribution by introducing a new parameter \(\lambda\)

\[ \mbox{for } y \geq 0: ~~ y(\lambda) = \left\{\begin{matrix} \frac{ y^{\lambda}-1 }{\lambda}, & \mbox{if } \lambda \neq 0\\ log(y), & \mbox{if } \lambda = 0 \end{matrix}\right. \]

preObj <- preProcess(training[,-58], method=c("BoxCox"))

trainCapAveS <- predict(object = preObj, newdata = training[,-58])$capitalAve

par(mfrow=c(1,2))
hist(trainCapAveS)
qqnorm(trainCapAveS)

5.3 Imputing Data

Imputing is the process of assigning values to NAs in the data set. Imputing may use any number of interpolation methods. A common method is k nearest neighbors, which averages the values surrounding each NA.

This is super easy, just add "knnImpute" to preProcess(..., method="..."):

set.seed(13343)

# introduce NAs
training$capAve <- training$capitalAve  # copy a column
selectNA <- rbinom(dim(training)[1], size=1, prob=.5) == 1  # make a boolean vector
training$capAve[selectNA] <- NA  # overwrite some values with NA

# Impute and standardize
preObj <- preProcess(training[,-58], method=c("knnImpute"))
capAve <- predict(preObj, training[,-58])$capAve

# Standardize the values
capAveTruth <- training$capitalAve
capAveTruth <- (capAveTruth - mean(capAveTruth)) / sd(capAveTruth)

quantile(capAve - capAveTruth)
##           0%          25%          50%          75%         100% 
## -6.552565113 -0.004786531  0.007954623  0.014351073  0.716373528
quantile((capAve - capAveTruth)[selectNA])
##          0%         25%         50%         75%        100% 
## -6.26513752 -0.01168132  0.01008928  0.02320144  0.71637353
quantile((capAve - capAveTruth)[!selectNA])
##           0%          25%          50%          75%         100% 
## -6.552565113 -0.001711543  0.007131843  0.011186118  0.014763419

Be mindful that if you preprocess training data, you must apply the very same transformation — with parameters (e.g. means and standard deviations) estimated from the training set — to the test data when predicting; reuse the training-set preProcess object rather than re-estimating on the test set. Re-estimating the transformation on the test set would use different parameters and skew the predictions in unpredictable ways.

6 Creating Covariates

A covariate is yet another name for a model feature. In the real world, not all data variables make it into the model. Covariates (or features, or predictors) have the implicit meaning that they may have been subset from a larger group, and are intended for model inclusion.

Covariates are created in 2 levels, which sort of correspond to in-model and out-model. The goal is to include the most data in the fewest number of features (e.g. maximize data density).

6.1 Discussion

Creating covariates is a delicate balancing act between information density (summarization) and information loss. It requires substantial application knowledge and domain expertise. Including more features tends toward overfitting (high variance), which can be corrected by regularization.

  • Level 1
    • When in doubt, err on the side of more features
    • Can be automated, but don’t do it; it rarely works
  • Level 2
    • only done on training sets
    • often necessary for regressions and SVMs
    • rarely needed for classification trees
    • achieved through exploratory analysis
    • new covariates should be added to data frames
    • use practical variable names

6.2 Dummy Variables

Dummy Variables are a stupid way to describe the process of converting factor levels into feature variables. If there is a factor variable with 2 levels, create a new feature for each level. Each of those features will have a boolean value for each example. This process expands the data space, widens the data set, and is also known as one-hot encoding.

library(ISLR); library(caret); data(Wage);

inTrain <- createDataPartition(y=Wage$wage, p=0.7, list=FALSE)
training <- Wage[inTrain,]; testing <- Wage[-inTrain,]

Explore the data

table(training$jobclass)
## 
##  1. Industrial 2. Information 
##           1051           1051

Create 2 new variables for these classes with caret::dummyVars(formula = , data = , ...)

dummies <- dummyVars(formula = wage~jobclass, data = training)

str(dummies)
## List of 9
##  $ call      : language dummyVars.default(formula = wage ~ jobclass, data = training)
##  $ form      :Class 'formula'  language wage ~ jobclass
##   .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv> 
##  $ vars      : chr [1:2] "wage" "jobclass"
##  $ facVars   : chr "jobclass"
##  $ lvls      :List of 1
##   ..$ jobclass: chr [1:2] "1. Industrial" "2. Information"
##  $ sep       : chr "."
##  $ terms     :Classes 'terms', 'formula'  language wage ~ jobclass
##   .. ..- attr(*, "variables")= language list(wage, jobclass)
##   .. ..- attr(*, "factors")= int [1:2, 1] 0 1
##   .. .. ..- attr(*, "dimnames")=List of 2
##   .. .. .. ..$ : chr [1:2] "wage" "jobclass"
##   .. .. .. ..$ : chr "jobclass"
##   .. ..- attr(*, "term.labels")= chr "jobclass"
##   .. ..- attr(*, "order")= int 1
##   .. ..- attr(*, "intercept")= int 1
##   .. ..- attr(*, "response")= int 1
##   .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv> 
##   .. ..- attr(*, "predvars")= language list(wage, jobclass)
##   .. ..- attr(*, "dataClasses")= Named chr [1:2] "numeric" "factor"
##   .. .. ..- attr(*, "names")= chr [1:2] "wage" "jobclass"
##  $ levelsOnly: logi FALSE
##  $ fullRank  : logi FALSE
##  - attr(*, "class")= chr "dummyVars"
head(predict(object = dummies, newdata = training))
##        jobclass.1. Industrial jobclass.2. Information
## 86582                       0                       1
## 161300                      1                       0
## 155159                      0                       1
## 11443                       0                       1
## 376662                      0                       1
## 450601                      1                       0

The training set now has a boolean flag for each person and job class.

6.3 Maximizing Feature Variability

Some features — whether qualitative, quantitative, or boolean — have very little variability, and won’t add much information to a model. Finding features of this kind is easy with caret::nearZeroVar(x= , freqCut = 95/5, uniqueCut = 10) or caret::nzv()

For nearZeroVar(): if saveMetrics = FALSE, returns a vector of integers corresponding to the column positions of the problematic predictors. If saveMetrics = TRUE, a data frame with columns:

nsv <- nearZeroVar(x = training, saveMetrics = TRUE)
nsv
##            freqRatio percentUnique zeroVar   nzv
## year        1.037356    0.33301618   FALSE FALSE
## age         1.027027    2.85442436   FALSE FALSE
## sex         0.000000    0.04757374    TRUE  TRUE
## maritl      3.272931    0.23786870   FALSE FALSE
## race        8.938776    0.19029496   FALSE FALSE
## education   1.389002    0.23786870   FALSE FALSE
## region      0.000000    0.04757374    TRUE  TRUE
## jobclass    1.000000    0.09514748   FALSE FALSE
## health      2.468647    0.09514748   FALSE FALSE
## health_ins  2.352472    0.09514748   FALSE FALSE
## logwage     1.061728   19.17221694   FALSE FALSE
## wage        1.061728   19.17221694   FALSE FALSE
  • freqRatio: the ratio of frequencies for the most common value over the second most common value
  • percentUnique: the percentage of unique data points out of the total number of data points
  • zeroVar: a vector of logicals for whether the predictor has only one distinct value
  • nzv: a vector of logicals for whether the predictor is a near zero variance predictor

6.4 Creating Polynomial Covariates

Recall that creating polynomial features is a simple way to improve model fit. Again, this is super easy with the splines package splines::bs function.

6.4.1 Example: Create 2 New Features

From the training$age variable, we want to create \(\{age, age^2, age^3 \}\):

library(splines)

# uses B-splines with 3 dof
bsBasis <- bs(training$age, df=3)
head(bsBasis, 3L)
##              1          2           3
## [1,] 0.2368501 0.02537679 0.000906314
## [2,] 0.4163380 0.32117502 0.082587862
## [3,] 0.4308138 0.29109043 0.065560908

6.4.2 Example: Fitting Splines

For computational efficiency, in practice the modeling code should be separated from the plotting code.

# fit wage against the 3-column spline basis built above
# NOTE(review): bsBasis is a global object, not a column of `training`, so the
# `data=training` argument is effectively unused for the predictor — confirm intended
lm1 <- lm(wage~bsBasis, data=training)

# the spline will be the y-arg of a geom_line()
# predict() reuses the training-set basis because bsBasis comes from the global env
splineLine <- predict(object = lm1, newdata = training)

g <- ggplot(data=training) + aes(x=age, y=wage) + geom_point() 

# overlay the fitted spline on the scatterplot
g <- g + geom_line(aes(x=age, y=splineLine), color='red', lwd=2)
print(g)

  • Generalized additive models (caret::train(..., method="gam")) also work, but are more complicated

7 Preprocessing with Principal Components (PCA)

Principal Components Analysis will segment out variables that are highly correlated.

Once again, the kernlab::spam dataset:

library(caret); library(kernlab); data(spam);

# creates a numeric vector that can subset from spam
inTrain<-createDataPartition(y=spam$type, p=0.75, list=FALSE)

training <- spam[inTrain, ]
testing <- spam[-inTrain, ]

M <- abs(cor(x = training[,-58]))
diag(M) <- 0  # remove autocorrelations
which(M > 0.9, arr.ind=TRUE) # which vars are more than 90% correlated?
##        row col
## num415  34  32
## num857  32  34

Two features are highly correlated: num415, num857. Double check to be sure:

# index by col num if you don't have the names
plot(spam[,34], spam[,32])

A straight line with \(slope \approx 1\); this is almost perfect correlation. A weighted combination of these features might capture most of the variance while removing all the correlation. Extending this idea, we want to find the smallest matrix \(M\) (subset) that captures the most variability (best explains) the original set. This matrix can be found via SVD/PCA

7.1 The Singular Value Decomposition (SVD)

Takes an original matrix \(X\) and decomposes it into a product of 3 matrices such that

\[X = UDV^T\]

  • columns of \(U\) are orthogonal (left singular vectors)
  • \(D\) is a diagonal matrix (singular values)
  • columns of \(V\) are orthogonal (right singular vectors)
    • (in most real data sets, if \(X\) is factorable, then \(U = V\))

7.2 The Principal Components

The Principal Components of \(X\) capture the variability of \(X\), ordered in descending order of explained variance. Conveniently, if the input variables have been scaled and mean-shifted (z-transformed), the principal components are equal to the columns of \(V\) (often also \(U\)).

Principal Components are very easy to find with stats::prcomp()

7.2.1 Example: PCA of Correlated Variables

From the previous example, columns 34 and 32 of spam are correlated:

ggplot() + aes(spam[,34], spam[,32]) + geom_point()

If we find and plot the principal components of these variables:

prComp <- prcomp(x = spam[,c(34,32)] )

# plot the first 2 principal components
ggplot() + aes(prComp$x[,1], prComp$x[,2]) + geom_point()

The points form a mostly flat cloud: there is almost no correlation between the first 2 principal components of num415 and num857.

What does the rotation matrix look like?

round( prComp$rotation, digits=4)
##           PC1     PC2
## num415 0.7081  0.7061
## num857 0.7061 -0.7081

7.2.2 Example: PCA with R::stats

typeColor <- ((spam$type=="spam")*1 + 1)
prComp <- prcomp(x = log10(spam[ , -58]+1) )  # Gaussify to correct data skew (log10(...))

ggplot() + aes(prComp$x[,1], prComp$x[,2], color = typeColor) + geom_point() +
        xlab("PC1") + ylab("PC2")

The principal components are pretty distinct.

7.2.3 Example: PCA with R::caret

During preprocessing (the preProcess() method), use method="pca"):

# log10(...) Gaussifies the input for better PCA results
preProc <- preProcess(log10(spam[,-58]+1), method="pca", pcaComp=2)

# the principal components matrix is actually created with predict()
spamPC <- predict(object = preProc, newdata = log10(spam[,-58]+1))

ggplot() + aes(spamPC[,1], spamPC[,2], color=typeColor) + geom_point() +
        xlab("PC1") + ylab("PC2")

Principal Components still pretty distinct.

7.2.4 Example: Fitting to Principal Components

From the previous example, spamPC is the training set, the first 2 principal components of kernlab::spam.

# scales & shifts for pca
preProc <- preProcess(log10(training[,-58]+1), method="pca", pcaComp=2)

# predict() actually does the PCA, returning the PCM
trainPC <- predict(object = preProc, newdata = log10(training[,-58]+1))

# attach the outcome so train() can use the formula interface on the PC scores
trainPC$type <- training$type
modelFit <- train(type ~., data=trainPC, method="glm")
# modelFit <- train(type ~., data = training, method = 'glm')
modelFit$results
##   parameter  Accuracy     Kappa  AccuracySD    KappaSD
## 1      none 0.9056476 0.8011514 0.005884655 0.01226668
# make some predictions with this model
# apply the SAME preProcess rotation (learned on training data) to the test set
testPC <- predict(preProc, log10(testing[,-58]+1))
# NOTE(review): caret's convention is confusionMatrix(data=predictions, reference=truth);
# here the truth (testing$type) is passed first, which swaps the Prediction/Reference
# labels and flips Sensitivity/Specificity — confirm intended
confusionMatrix(testing$type, predict(object=modelFit, newdata=testPC))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction nonspam spam
##    nonspam     645   52
##    spam         73  380
##                                           
##                Accuracy : 0.8913          
##                  95% CI : (0.8719, 0.9087)
##     No Information Rate : 0.6243          
##     P-Value [Acc > NIR] : < 2e-16         
##                                           
##                   Kappa : 0.7705          
##  Mcnemar's Test P-Value : 0.07364         
##                                           
##             Sensitivity : 0.8983          
##             Specificity : 0.8796          
##          Pos Pred Value : 0.9254          
##          Neg Pred Value : 0.8389          
##              Prevalence : 0.6243          
##          Detection Rate : 0.5609          
##    Detection Prevalence : 0.6061          
##       Balanced Accuracy : 0.8890          
##                                           
##        'Positive' Class : nonspam         
## 

A simpler form for doing the same thing is

modelFit <- train(type ~ ., method="glm", preProcess="pca", data=training)
confusionMatrix(testing$type, predict(modelFit, testing))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction nonspam spam
##    nonspam     658   39
##    spam         50  403
##                                           
##                Accuracy : 0.9226          
##                  95% CI : (0.9056, 0.9374)
##     No Information Rate : 0.6157          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.8372          
##  Mcnemar's Test P-Value : 0.2891          
##                                           
##             Sensitivity : 0.9294          
##             Specificity : 0.9118          
##          Pos Pred Value : 0.9440          
##          Neg Pred Value : 0.8896          
##              Prevalence : 0.6157          
##          Detection Rate : 0.5722          
##    Detection Prevalence : 0.6061          
##       Balanced Accuracy : 0.9206          
##                                           
##        'Positive' Class : nonspam         
## 

In conclusion, transform data & remove outliers before doing this

8 Predicting with Univariate Regression

Linear regressions are simple and easy, but limited to linear settings. With one variable (univariate), just create a line of form \(y = mx + b\). In most machine learning settings, the parameters \(m\) and \(b\) are called weights, and notated with a single symbol: \(\{w_0, w_1 \}\), \(\{\theta_0, \theta_1\}\), etc.

In R, univariate regression is done with caret::train() or stats::predict()

8.0.1 Example: Modeling Geyser Eruptions

In this example, 1 output (eruption duration) is modeled by 1 input (time between eruptions).

# data in caret::faithful
library(caret); data(faithful); library(ggplot2);

set.seed(333)

inTrain <- createDataPartition(y=faithful$waiting, p=0.5, list=FALSE)
trainFaith <- faithful[inTrain,]
testFaith <- faithful[-inTrain,]

head(trainFaith)
##   eruptions waiting
## 1     3.600      79
## 3     3.333      74
## 5     4.533      85
## 6     2.883      55
## 7     4.700      88
## 8     3.600      85
ggplot(data=trainFaith) + aes(x=waiting, y=eruptions) + geom_point(color='blue') +
            geom_abline(slope = 0.07, intercept = -1.5, linetype="dashed") +
            xlab("Waiting") + ylab("Duration")

The relationship looks pretty linear, \(Duration \approx m \times Waiting + b\)


We can find the best fit line more precisely with stats::lm:

library(stats)

lm1 <- lm(eruptions ~ waiting, data=trainFaith)
summary(lm1)
## 
## Call:
## lm(formula = eruptions ~ waiting, data = trainFaith)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.26990 -0.34789  0.03979  0.36589  1.05020 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.792739   0.227869  -7.867 1.04e-12 ***
## waiting      0.073901   0.003148  23.474  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.495 on 135 degrees of freedom
## Multiple R-squared:  0.8032, Adjusted R-squared:  0.8018 
## F-statistic:   551 on 1 and 135 DF,  p-value: < 2.2e-16

The slope and intercepts are in Coefficients:; slope = waiting 0.073901, intercept = (Intercept) -1.792739.

An easy way to access the parameters is:

# returns named vectors
lm1$coefficients   
# or
coef(lm1)
## (Intercept)     waiting 
## -1.79273941  0.07390058 
## (Intercept)     waiting 
## -1.79273941  0.07390058

Making a single prediction is simply:

# new input = 80
coef(lm1)[1] + coef(lm1)[2]*80
## (Intercept) 
##    4.119307

Ignore the (Intercept) label — it is just the names attribute carried over from coef(lm1)[1], not an error.

Single predictions are even easier with stats::predict:

# takes a linear model object, and data frame of new inputs
predict(object=lm1, data.frame(waiting=80) )
##        1 
## 4.119307

You can access all the predictions of this model with:

lm1$fitted[1:5]  # returns a numeric vector
##        1        3        5        6        7 
## 4.045407 3.675904 4.488810 2.271793 4.710512

This makes plotting easy:

ggplot(data=trainFaith) + aes(x=waiting, y=eruptions) + geom_point(color='blue') +
            geom_line(aes(x=waiting, y=lm1$fitted), color="red", lwd=1.5) +
            xlab("Waiting") + ylab("Duration")

8.0.2 Example: Plot Predictions

We use stats::predict and the existing linear model, built on the training data. When predicting from the test set, we pass the test set into predict() as an argument.

library(gridExtra)
# training set
g1 <- ggplot(data=trainFaith) + aes(x=waiting, y=eruptions) + geom_point(color='blue') +
            geom_line( aes( x=waiting, y=predict(lm1) ) ) +
            xlab("Waiting") + ylab("Duration") + ggtitle("Training Set")
print(g1)

# test set 
g2 <- ggplot(data=testFaith) + aes(x=waiting, y=eruptions) + geom_point(color='blue') +
            geom_line( aes( x=waiting, y=predict(lm1, newdata = testFaith ) ) ) +
            xlab("Waiting") + ylab("Duration") + ggtitle("Test Set")

grid.arrange(g1, g2, ncol=2)

Note the differing arguments in ggplot( data = ) and y = predict(). The test fit is not quite as good, which makes sense.

8.0.3 Example: Training and Test Set Errors

8.0.3.1 MSE

# training MSE: mean squared difference between fitted values and training outcomes
sum((lm1$fitted - trainFaith$eruptions)^2)/length(trainFaith$eruptions)
## [1] 0.2414883
# test MSE - substitute predict(...) for lm1$fitted
# FIX: compare test-set predictions against testFaith$eruptions (the original code
# used trainFaith$eruptions, which recycled mismatched-length vectors — hence the
# warning in the old transcript — and produced a meaningless error of 2.460094)
sum((  predict(lm1, newdata=testFaith) - testFaith$eruptions)^2)/length(testFaith$eruptions)
## (re-run to get the corrected test MSE; no recycling warning after the fix)

8.0.3.2 RMSE

# training RMSE: square root of the training MSE, in the units of the outcome
sqrt(sum((lm1$fitted - trainFaith$eruptions)^2)/length(trainFaith$eruptions))
## [1] 0.4914146
# test RMSE - substitute predict(...) for lm1$fitted
# FIX: compare test-set predictions against testFaith$eruptions (the original code
# used trainFaith$eruptions, which recycled mismatched-length vectors — hence the
# warning in the old transcript — and produced a meaningless error of 1.568469)
sqrt(sum((  predict(lm1, newdata=testFaith) - testFaith$eruptions)^2)/length(testFaith$eruptions))
## (re-run to get the corrected test RMSE; no recycling warning after the fix)

8.0.4 Example: Plotting Prediction Intervals with R::stats

stats::predict() can calculate intervals automatically, if you pass the interval=" " argument. The resulting prediction object has 3 columns: {"fit", "lwr", "upr"} containing the best-fit predictions and the corresponding lower and upper prediction-interval bounds. Unfortunately, the columns can’t be accessed with pred1$fit; they must be indexed via pred1[rows, "fit"].

pred1 <- predict(lm1, newdata=testFaith, interval="prediction")
ord <- order(testFaith$waiting)

ggplot(data=testFaith) + aes(x=waiting, y=eruptions) + geom_point(color="blue") +
    geom_line( aes(x=waiting[ord], y=pred1[ord,"lwr"]), color="red" ) +
    geom_line( aes(x=waiting[ord], y=pred1[ord,"fit"]), lwd=1.2) +
    geom_line( aes(x=waiting[ord], y=pred1[ord,"upr"]), color="red" ) 

8.0.5 Example: Plotting Prediction Intervals with R::caret

This method passes a caret linear model object to stats::predict(), which calculates the prediction intervals.

# get a glm with caret::train()
modFit <- train(eruptions ~ waiting, data=trainFaith, method="lm")
summary(modFit$finalModel)
## 
## Call:
## lm(formula = .outcome ~ ., data = dat)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.26990 -0.34789  0.03979  0.36589  1.05020 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.792739   0.227869  -7.867 1.04e-12 ***
## waiting      0.073901   0.003148  23.474  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.495 on 135 degrees of freedom
## Multiple R-squared:  0.8032, Adjusted R-squared:  0.8018 
## F-statistic:   551 on 1 and 135 DF,  p-value: < 2.2e-16
# pass finalModel to stats::predict()
pred1 <- predict(modFit$finalModel, newdata=testFaith, interval="prediction")
ord <- order(testFaith$waiting)

# plot it!!!
ggplot(data=testFaith) + aes(x=waiting, y=eruptions) + geom_point(color="blue") +
    geom_line( aes(x=waiting[ord], y=pred1[ord,"lwr"]), color="red" ) +
    geom_line( aes(x=waiting[ord], y=pred1[ord,"fit"]), lwd=1.2) +
    geom_line( aes(x=waiting[ord], y=pred1[ord,"upr"]), color="red" ) 

9 Predicting with Multivariate Regression

Taking another look at the ISLR::Wage data set reveals multiple features that seem important.

library(ISLR); library(ggplot2); library(caret);
data(Wage)

# subset out the variable we're trying to predict
Wage <- subset(Wage, select= -c(logwage))
summary(Wage)
##       year           age               sex                    maritl    
##  Min.   :2003   Min.   :18.00   1. Male  :3000   1. Never Married: 648  
##  1st Qu.:2004   1st Qu.:33.75   2. Female:   0   2. Married      :2074  
##  Median :2006   Median :42.00                    3. Widowed      :  19  
##  Mean   :2006   Mean   :42.41                    4. Divorced     : 204  
##  3rd Qu.:2008   3rd Qu.:51.00                    5. Separated    :  55  
##  Max.   :2009   Max.   :80.00                                           
##                                                                         
##        race                   education                     region    
##  1. White:2480   1. < HS Grad      :268   2. Middle Atlantic   :3000  
##  2. Black: 293   2. HS Grad        :971   1. New England       :   0  
##  3. Asian: 190   3. Some College   :650   3. East North Central:   0  
##  4. Other:  37   4. College Grad   :685   4. West North Central:   0  
##                  5. Advanced Degree:426   5. South Atlantic    :   0  
##                                           6. East South Central:   0  
##                                           (Other)              :   0  
##            jobclass               health      health_ins  
##  1. Industrial :1544   1. <=Good     : 858   1. Yes:2083  
##  2. Information:1456   2. >=Very Good:2142   2. No : 917  
##                                                           
##                                                           
##                                                           
##                                                           
##                                                           
##       wage       
##  Min.   : 20.09  
##  1st Qu.: 85.38  
##  Median :104.92  
##  Mean   :111.70  
##  3rd Qu.:128.68  
##  Max.   :318.34  
## 

At this point, create training and test sets, just for the hell of it.

inTrain <- createDataPartition(y=Wage$wage, p=0.7, list=FALSE)
training <- Wage[inTrain,]
testing <- Wage[-inTrain,]

dim(training); dim(testing);
## [1] 2102   11
## [1] 898  11

To begin exploration, a feature plot reveals relationships between the 3 features and the output

# caret::featurePlot()
featurePlot(x=training[, c("age", "education", "jobclass")], y=training$wage, plot="pairs")

Each pair plot shows a mild trend with some outliers, suggesting that each feature may be used as a predictor.

Let’s take a closer look at age

# this first plot shows a VERY subtle trend
g1 <- ggplot(data=training) + aes(x=age, y=wage) + geom_point()

# we can plot a second feature by using color
g2 <- ggplot(data=training) + aes(x=age, y=wage, color=jobclass) + geom_point()

grid.arrange(ncol=2, g1, g2)

The gray plot shows that high wage correlates with middle age, but not much else. Adding color shows that information jobs skew higher wages, and account for most of the high wage outliers.

What about education?

g3 <- ggplot(data=training) + aes(x=age, y=wage, color=education) + geom_point()
grid.arrange(ncol=2, g2, g3)

Now we see that more education generally correlates with higher wage.

9.0.1 Example: Fit a Multivariate Linear Model

caret::train(formula= ) can take multiple features as x-arguments; it will then fit a model of the form

\[ y = b_0 + b_1 \cdot age + \sum_{j=1}^{J} b_{2j} \cdot jobclass_j + \sum_{k=1}^{K} b_{3k} \cdot education_k \]

modFit <- train(wage ~ age+jobclass+education, method="lm", data=training)

finMod <- modFit$finalModel
print(modFit)
## Linear Regression 
## 
## 2102 samples
##    3 predictor
## 
## No pre-processing
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 2102, 2102, 2102, 2102, 2102, 2102, ... 
## Resampling results:
## 
##   RMSE      Rsquared 
##   36.06666  0.2517055
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE
## 

Each iteration sampled all 2102 rows of the training set. The 3 features have been expanded to 10 predictors to account for the multiple classes of jobclass and education.

The base plot() function will iterate through all the pairwise plots in the multivariate model (there are 6).

plot(finMod, which = 1, cex=0.5, col="#00000010")

Outliers at top-left are labeled.

Sometimes, coloring by a variable not included in the model will show relationships.

ggplot(data=training) + aes(finMod$fitted, finMod$residuals, color=training$race) + geom_point()

The highest outliers are all black, hmmm. What might that suggest?

9.0.2 Example: Plotting Residuals v Index

Sometimes there will be systemic bias due to the row-ordering of a data set. Plotting residuals vs row-index will show this.

# plot residuals vs row index; seq_along() avoids hard-coding the training-set
# size (2102) and stays correct if the partition changes
ggplot() + aes(x=seq_along(finMod$residuals), y=finMod$residuals) + geom_point()

That doesn’t seem to be the case here, but when there is a trend, it suggests that a variable is missing from the model.

9.0.3 Example: Plotting All Covariates

modFitAll <- train(wage ~ age+jobclass+education, method="lm", data=training)
pred <- predict(modFitAll, testing)

ggplot(data=testing) + aes(x=wage, y=pred) + geom_point()

This plot shows the predicted wage that corresponds to each actual wage.